In [37]:
import time
import datetime as dt
import pickle
import numpy as np
import random
import scipy as sp
from dict_stops import *
import pandas as pd
import os
import csv
from geopy.distance import vincenty

In [5]:
# Función que estandariza los valores de los paraderos de subida 
# y bajada
def update_vals(row,data = load_metro_dictionary()):
    if row.par_subida in data:
        row.par_subida = data[row.par_subida]
    if row.par_bajada in data:
        row.par_bajada = data[row.par_bajada]
    return row

In [6]:
# Función que estandariza los valores de los paraderos de subida 
# y bajada
def add_vals(row,latlong,paradero,data = dict_latlong_stops):
    stop_name = row[paradero]
    if stop_name in data:
        return data[stop_name][latlong]
    else :
        return np.nan

In [7]:
def frame_config(frame):
    frame['tiempo_subida'] = pd.to_datetime(frame.tiempo_subida)
    frame['tiempo_bajada'] = pd.to_datetime(frame.tiempo_bajada)
    frame = frame.apply(update_vals, axis=1)
    frame['weekday'] = frame.tiempo_subida.dt.dayofweek
    frame['lat_subida'] = frame.apply(add_vals,args=('lat','par_subida'),axis=1)
    frame['lat_bajada'] = frame.apply(add_vals,args=('lat','par_bajada'),axis=1)
    frame['long_subida'] = frame.apply(add_vals,args=('long','par_subida'),axis=1)
    frame['long_bajada'] = frame.apply(add_vals,args=('long','par_bajada'),axis=1)
    frame = frame.sort_values(by=['id', 'tiempo_subida'])
    frame['diferencia_tiempo'] = (frame['tiempo_subida']-frame['tiempo_subida'].shift()).fillna(0)
    return frame

In [22]:
def hour_to_seconds(an_hour):
    return int(an_hour.hour*3600 + an_hour.minute *60 + an_hour.second)

In [8]:
frame = frame_config(frame)

In [9]:
df_id_period = frame_config(df_id_period)

In [11]:
dframe = frame[['id','tiempo_subida','lat_subida','long_subida','tiempo_bajada','lat_bajada','long_bajada']]

In [12]:
df_id_period = df_id_period[['id','tiempo_subida','lat_subida','long_subida','tiempo_bajada','lat_bajada','long_bajada']]

In [3]:
if os.name == 'nt':
    path_subway_dictionary = 'C:\Users\catalina\Documents\Datois\Diccionario-EstacionesMetro.csv'
    path_csv_sequences = 'C:\Users\catalina\Documents\sequences\\'
else:
    path_subway_dictionary = '/home/cata/Documentos/Datois/Diccionario-EstacionesMetro.csv'
    path_csv_sequences = '/home/cata/Documentos/sequences/'

# Función que carga las estaciones de metro
# en un diccionario
def load_metro_dictionary():
    dict_metro = {}
    with open(path_subway_dictionary,mode='r') as infile:
        reader = csv.reader(infile,delimiter=';')
        dict_metro = {rows[5]:rows[7] for rows in reader}
    return dict_metro

In [2]:
frame = pd.read_csv('/home/cata/Documentos/Datois/etapas_2013_abril_allyearsids_10_100000.csv')

In [4]:
df_id_period = pd.read_csv('/home/cata/Documentos/Datois/etapas_2013_septiembre_allyearsids_10_100000.csv')
df_id_period['tiempo_subida'] = pd.to_datetime(df_id_period.tiempo_subida)
df_id_period = df_id_period.sort_values(by=['id', 'tiempo_subida'])

Probar función delete


In [13]:
def create_sequence(id_user, mls, nvisitas, sequence):
	profile = {'user_id':id_user,'mls':mls,'nvisitas':nvisitas,'sequence':sequence}
	return profile

In [14]:
def buscar_locacion(mls,location):
	try:
		index_location = mls.index(location)
	except ValueError:
		index_location = -1
	return index_location

In [15]:
def get_sequences(ids,lat_subidas,long_subidas,t_subidas,lat_bajadas,long_bajadas,t_bajadas):
    # se inicializan las variables con los valores de la primera transaccion
    profiles= [] # arreglo de diccionarios
    First = True
    # inicializo para despues usarlas
    last_id = -22
    mls = []
    nvisitas = []
    sequence = []
    times = []
    counter = 0
    for transaction in zip(ids,lat_subidas,long_subidas,t_subidas,lat_bajadas,long_bajadas,t_bajadas):
        id_user = transaction[0]
        lat_subida = transaction[1]
        long_subida = transaction[2]
        t_subida = transaction[3]
        lat_bajada = transaction[4]
        long_bajada = transaction[5]
        t_bajada = transaction[6]
        counter += 1
        if (lat_subida!=lat_subida or t_subida != t_subida):
            continue 
        par_subida = (lat_subida,long_subida)
        par_bajada = (lat_bajada,long_bajada)
        subida_3 = (lat_subida,long_subida,hour_to_seconds(t_subida))
        if First:
            last_id = id_user
            mls = [par_subida]
            sequence = [subida_3]
            last_stop = par_subida
            times.append(hour_to_seconds(t_subida))
            nvisitas = [0]
            counter = 1
            First = False
        if id_user!=last_id:       
            profiles.append(create_sequence(last_id,mls,nvisitas,sequence))
            last_id = id_user
            mls = [par_subida]
            sequence = [subida_3]
            last_stop = par_subida
            nvisitas = [0]
            counter = 1

        index_subida = buscar_locacion(mls,par_subida)
        # si la subida no había sido visitada se debe agregar al mls
        if (index_subida < 0):
            mls.append(par_subida)
            nvisitas.append(1)
            index_subida = len(mls) - 1
            sequence.append(subida_3)
            times.append(hour_to_seconds(t_subida))
            # si la bajada no se pudo calcular solo se considera la subida y se deja para calcular tpm en la proxima ronda 
            if (lat_bajada!=lat_bajada or t_bajada != t_bajada):
                last_stop = par_subida
                #print "Iteración n°: " + str(counter) + " , no se pudo estimar la bajada"
            else:
                bajada_3 = (lat_bajada,long_bajada,hour_to_seconds(t_bajada))
                last_stop = par_bajada
                sequence.append(bajada_3)
                times.append(hour_to_seconds(t_bajada))
                index_bajada = buscar_locacion(mls,par_bajada)
                # si la bajada no se había visitado antes, agregar bajada y sumar nvisitas 
                if (index_bajada < 0):
                    mls.append(par_bajada)
                    index_bajada = len(mls)-1
                    nvisitas.append(1)
                # sumar nvisita 
                else:
                    nvisitas[index_bajada] = nvisitas[index_bajada]+1
        else:
            nvisitas[index_subida] = nvisitas[index_subida]+1
            
            if(par_subida!=last_stop):
                sequence.append(subida_3)
                times.append(hour_to_seconds(t_subida))
            # subida estaba de antes y no hay bajada
            # REVISAR SI ESTO NO ES REDUNDANTE!
            if (lat_bajada!=lat_bajada or t_bajada!=t_bajada):
                last_stop = par_subida
            # hay subida y bajada
            else:
                bajada_3 = (lat_bajada,long_bajada,hour_to_seconds(t_bajada))
                sequence.append(bajada_3)
                times.append(hour_to_seconds(t_bajada))
                last_stop = par_bajada
                index_bajada = buscar_locacion(mls,par_bajada)
                # hay bajada pero no estaba antes
                if (index_bajada<0):
                    mls.append(par_bajada)
                    index_bajada = len(mls) - 1
                    nvisitas.append(1)
                # subida y bajada estaban de antes
                else:
                    nvisitas[index_bajada] = nvisitas[index_bajada]+1
    profiles.append(create_sequence(last_id,mls,nvisitas,sequence))

    return profiles

In [23]:
profiles = get_sequences(dframe['id'],dframe['lat_subida'],dframe['long_subida'],dframe['tiempo_subida'],dframe['lat_bajada'],dframe['long_bajada'],dframe['tiempo_bajada'])

In [24]:
profiles_tw2 = get_sequences(df_id_period['id'],df_id_period['lat_subida'],df_id_period['long_subida'],df_id_period['tiempo_subida'],df_id_period['lat_bajada'],df_id_period['long_bajada'],df_id_period['tiempo_bajada'])

In [26]:
def delete(sequence,i,c,sum_lat=0,sum_long=0,sum_temp=0):
    n = len(sequence)
    if sum_lat == 0:
        for seq in sequence:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]
    lat_distance = (sum_lat/n-(sum_lat-sequence[i][0])/(n-1))**2
    long_distance = (sum_long/n-(sum_long-sequence[i][1])/(n-1))**2
    temporal_distance = (sum_temp/n-(sum_temp-sequence[i][2])/(n-1))**2
    spatial_distance = lat_distance + long_distance
    return ((1-c)*spatial_distance+c*temporal_distance)**0.5

In [27]:
def insert(sequence,pi,c,sum_lat=0,sum_long=0,sum_temp=0):
    n = len(sequence)
    if sum_lat == 0:
        for seq in sequence:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]
    lat_distance = (sum_lat/n-(sum_lat+pi[0])/(n+1))**2
    long_distance = (sum_long/n-(sum_long+pi[0])/(n+1))**2
    temporal_distance = (sum_temp/n-(sum_temp+pi[0])/(n+1))**2
    spatial_distance = lat_distance + long_distance
    return ((1-c)*spatial_distance+c*temporal_distance)**0.5

In [28]:
def replace(sequence,pi,pj,c,sum_lat=0,sum_long=0,sum_temp=0):
    n = len(sequence)
    if sum_lat == 0:
        for seq in sequence:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]
    sum_lat_plus_pj = sum_lat - pi[0] +pj[0]
    sum_long_plus_pj = sum_long - pi[1] +pj[1]
    sum_temp_plus_pj = sum_temp - pi[2] +pj[2]
    lat_distance = (sum_lat/n-sum_lat_plus_pj/n)**2
    long_distance = (sum_long/n-sum_long_plus_pj/n)**2
    temporal_distance = (sum_temp/n-sum_temp_plus_pj/n)**2
    spatial_distance = lat_distance + long_distance
    return ((1-c)*spatial_distance+c*temporal_distance)**0.5

In [29]:
def cost(a_tuple):
	return a_tuple

In [30]:
# Funcion que compara la similitud entre un perfil y una secuencia de transacciones
# Se normaliza el calculo según el largo de la secuencia
# get_simliarity: [[int]] [string] [string] int int-> int 
def get_similarity(sequence_a,sequence_b,c,sum_lat,sum_long,sum_temp):
    length_sequence_a = len(sequence_a)
    length_sequence_b = len(sequence_b)
    D = np.zeros((length_sequence_a+1,length_sequence_b+1))
    for i in range(length_sequence_a):
        D[i+1,0] = D[i,0] + delete(sequence_a,i,c)
    for j in range(length_sequence_b):
        D[0,j+1] = D[0,j] + insert(sequence_a,sequence_b[j],c)
    for i in range(1,length_sequence_a+1):
        for j in range(1,length_sequence_b+1):
            m1 = D[i-1,j-1] + replace(sequence_a,sequence_a[i-1],sequence_b[j-1],c,sum_lat,sum_long,sum_temp)
            m2 = D[i-1,j] + delete(sequence_a,i-1,c,sum_lat,sum_long,sum_temp)
            m3 = D[i,j-1] + insert(sequence_a,sequence_b[j-1],c,sum_lat,sum_long,sum_temp)
            D[i,j] = min(m1,m2,m3)
    return D[length_sequence_a,length_sequence_b]

# Funcion que construye la matriz de identificacion en que cada indice corresponde
# a la similitud entre la i-esima tpm y la j-esima secuencia, obtenidas a partir de un
# perfil de usuario y un periodo de identificacion.
# len(users_profiles) == len(users_sequences)
# asume que los usuarios de users_profiles y users_sequences son los mismos
# get_identification_matrix; get_profiles(...) get_sequences(...) -> [[int]]
def get_identification_matrix(profiles_tw1,profiles_tw2,c):
    i = 0
    j = 0
    limit = min((len(profiles_tw1),len(profiles_tw2)))
    identification_matrix = np.zeros((limit,limit))
    for profile_i in profiles_tw1:
        sequence_a = profile_i['sequence']
        sum_lat = 0
        sum_long = 0
        sum_temp = 0
        for seq in sequence_a:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]
        length_sequence_a = len(sequence_a)
        D_0 = np.zeros((length_sequence_a+1,1))
        for n in range(length_sequence_a):
            D_0[n+1,0] = D_0[n,0] + delete(sequence_a,n,c)
        for profile_j in profiles_tw2:
            sequence_b = profile_j['sequence']
            length_sequence_b = len(sequence_b)
            D = np.zeros((length_sequence_a+1,length_sequence_b+1))
            D[:,0] = D_0[:,0]
            for s in range(length_sequence_b):
                D[0,s+1] = D[0,s] + insert(sequence_a,sequence_b[s],c)
            for r in range(1,length_sequence_a+1):
                for t in range(1,length_sequence_b+1):
                    m1 = D[r-1,t-1] + replace(sequence_a,sequence_a[r-1],sequence_b[t-1],c,sum_lat,sum_long,sum_temp)
                    m2 = D[r-1,t] + delete(sequence_a,r-1,c,sum_lat,sum_long,sum_temp)
                    m3 = D[r,t-1] + insert(sequence_a,sequence_b[t-1],c,sum_lat,sum_long,sum_temp)
                    D[r,t] = min(m1,m2,m3)
            identification_matrix[i,j] = D[length_sequence_a,length_sequence_b]
            j += 1
            if(j >= limit):
                break
        i += 1
        j=0
        if(i >= limit):
            break
    return identification_matrix

In [31]:
init_time = time.time()
iden_matrix = get_identification_matrix(profiles[:20],profiles_tw2[:20],0)
print time.time()-init_time


4.88629007339

In [32]:
4.89/400


Out[32]:
0.012225

In [33]:
a = 0
b = 0
for i in range(len(iden_matrix)):
    if i == np.argmin(iden_matrix[i,:]):
        a += 1
    if i == np.argmin(iden_matrix[:,i]):
        b +=1
print str(a*100.0/len(iden_matrix))
print str(b*100.0/len(iden_matrix))


40.0
45.0

In [34]:
def delete_meters(sequence,i,c,sum_lat=0,sum_long=0,sum_temp=0):
    n = len(sequence)
    if sum_lat == 0:
        for seq in sequence:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]

    original_centroid = (sum_lat/n,sum_long/n)
    modified_centroid = ((sum_lat-sequence[i][0])/(n-1),(sum_long-sequence[i][1])/(n-1))
    temporal_distance = (sum_temp/n-(sum_temp-sequence[i][2])/(n-1))**2
    spatial_distance = vincenty(original_centroid,modified_centroid).meters **2
    return ((1-c)*spatial_distance+c*temporal_distance)**0.5

def insert_meters(sequence,pi,c,sum_lat=0,sum_long=0,sum_temp=0):
    n = len(sequence)
    if sum_lat == 0:
        for seq in sequence:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]
    original_centroid = (sum_lat/n,sum_long/n)
    modified_centroid = ((sum_lat+pi[0])/(n+1),(sum_long+pi[0])/(n+1))
    temporal_distance = (sum_temp/n-(sum_temp+pi[0])/(n+1))**2
    spatial_distance = vincenty(original_centroid,modified_centroid).meters **2
    return ((1-c)*spatial_distance+c*temporal_distance)**0.5

def replace_meters(sequence,pi,pj,c,sum_lat=0,sum_long=0,sum_temp=0):
    n = len(sequence)
    if sum_lat == 0:
        for seq in sequence:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]
    sum_lat_plus_pj = sum_lat - pi[0] +pj[0]
    sum_long_plus_pj = sum_long - pi[1] +pj[1]
    sum_temp_plus_pj = sum_temp - pi[2] +pj[2]
    original_centroid = (sum_lat/n,sum_long/n)
    modified_centroid = (sum_lat_plus_pj/n,sum_long_plus_pj/n)
    temporal_distance = (sum_temp/n-sum_temp_plus_pj/n)**2
    spatial_distance = vincenty(original_centroid,modified_centroid).meters **2
    return ((1-c)*spatial_distance+c*temporal_distance)**0.5

In [50]:
# Funcion que construye la matriz de identificacion en que cada indice corresponde
# a la similitud entre la i-esima tpm y la j-esima secuencia, obtenidas a partir de un
# perfil de usuario y un periodo de identificacion.
# len(users_profiles) == len(users_sequences)
# asume que los usuarios de users_profiles y users_sequences son los mismos
# get_identification_matrix; get_profiles(...) get_sequences(...) -> [[int]]
def get_identification_matrix_meters(profiles_tw1,profiles_tw2,c):
    i = 0
    j = 0
    limit = min((len(profiles_tw1),len(profiles_tw2)))
    identification_matrix = np.zeros((limit,limit))
    for profile_i in profiles_tw1:
        sequence_a = profile_i['sequence']
        sum_lat = 0
        sum_long = 0
        sum_temp = 0
        for seq in sequence_a:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]
        length_sequence_a = len(sequence_a)
        D_0 = np.zeros((length_sequence_a+1,1))
        for n in range(length_sequence_a):
            D_0[n+1,0] = D_0[n,0] + delete_meters(sequence_a,n,c)
        for profile_j in profiles_tw2:
            sequence_b = profile_j['sequence']
            length_sequence_b = len(sequence_b)
            D = np.zeros((length_sequence_a+1,length_sequence_b+1))
            D[:,0] = D_0[:,0]
            for s in range(length_sequence_b):
                D[0,s+1] = D[0,s] + insert_meters(sequence_a,sequence_b[s],c)
            for r in range(1,length_sequence_a+1):
                for t in range(1,length_sequence_b+1):
                    m1 = D[r-1,t-1] + replace_meters(sequence_a,sequence_a[r-1],sequence_b[t-1],c,sum_lat,sum_long,sum_temp)
                    m2 = D[r-1,t] + delete_meters(sequence_a,r-1,c,sum_lat,sum_long,sum_temp)
                    m3 = D[r,t-1] + insert_meters(sequence_a,sequence_b[t-1],c,sum_lat,sum_long,sum_temp)
                    D[r,t] = min(m1,m2,m3)
            identification_matrix[i,j] = D[length_sequence_a,length_sequence_b]
            j += 1
            if(j >= limit):
                break
        i += 1
        j=0
        if(i >= limit):
            break
    return identification_matrix

In [51]:
init_time = time.time()
iden_matrix_meters = get_identification_matrix_meters(profiles[:100],profiles_tw2[:100],0)
print time.time()-init_time


1257.12802982

In [53]:
a = 0
b = 0
for i in range(len(iden_matrix)):
    if i == np.argmin(iden_matrix_meters[i,:]):
        a += 1
    if i == np.argmin(iden_matrix_meters[:,i]):
        b +=1
print str(a*100.0/len(iden_matrix))
print str(b*100.0/len(iden_matrix))


30.0
20.0

In [ ]: